package org.jeecg.modules.webcrawler.job;

import java.io.IOException;
import java.net.URLEncoder;
import java.util.Date;
import java.util.List;

import org.apache.commons.lang.StringUtils;
import org.hibernate.validator.internal.util.stereotypes.ThreadSafe;
import org.jeecg.common.util.DateUtils;
import org.jeecg.common.util.HttpRequest;
import org.jeecg.modules.webcrawler.entity.WebCrawlerWord;
import org.jeecg.modules.webcrawler.util.WebCrawlerCacheUtils;
import org.jeecg.modules.webcrawler.util.bloomfilter.BloomFilterHelper;
import org.jeecg.modules.webcrawler.util.bloomfilter.BloomRedisService;
import org.jsoup.Connection;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
import org.quartz.Job;
import org.quartz.JobExecutionContext;
import org.quartz.JobExecutionException;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.web.util.HtmlUtils;

import com.alibaba.fastjson.JSONArray;
import com.alibaba.fastjson.JSONObject;

import lombok.extern.slf4j.Slf4j;

/**
 * 新华网
 * 
 * @author Scott
 */
@Slf4j
public class XinHuaWangJob implements Job {
	
	@Autowired
    private BloomRedisService redisService;

    @Autowired
    private BloomFilterHelper bloomFilterHelper;
	
	@Override
	public void execute(JobExecutionContext jobExecutionContext) throws JobExecutionException {
		log.info(String.format("新华网!  时间:" + DateUtils.getTimestamp()));
//		List<WebCrawlerWord> wordList = WebCrawlerCacheUtils.queryWordList();
//		for (WebCrawlerWord word : wordList) {
//			get_list(word.getTitle(), 1);
//		}
		
		
		xijinping("习近平", 1, 672);
		cctv_xijinping("习近平", 1, 672);
		sina_xijinping("习近平", 1, 672);
//		List<WebCrawlerWord> wordList = WebCrawlerCacheUtils.queryWordList();
//		for (WebCrawlerWord word : wordList) {
//			System.out.println(word.getTitle());
//			if(word.getSort() == 1) {		//公安
//				get_list(word.getTitle(), 1, 4);
//			}else if(word.getSort() == 2) {//经开区
//				get_list(word.getTitle(), 1, 104);
//			}else if(word.getSort() == 3) {//经开区
//				get_list(word.getTitle(), 1, 304);
//			}else if(word.getSort() == 667) {//王东峰
//				get_list(word.getTitle(), 1, 667);
//			}else if(word.getSort() == 668) {//高宏志
//				get_list(word.getTitle(), 1, 668);
//			}
//			
//			System.out.println("-------------------------------------");
//		}
	}
	
	
	/**
	 * 邯郸新闻网
	 */
	public  void get_list(String keyword,int page, int type){
		//整个html内容
		Document doc;
		int errcount = 0; //重复次数
		int count = 0;
		try {
			int random=(int)(Math.random()*20+1)*1000;
			Thread.sleep(random);	//
			String result = HttpRequest.sendPost("http://so.news.cn/getNews", "keyword="+keyword+"&curPage="+page+"&sortField=0&searchFields=1&lang=cn");
			if(StringUtils.isNotEmpty(result)) {
				log.info("***************************************"+keyword+"*****第"+page+"页(新华网)********************************************");
				//System.out.println(result);
				JSONObject json = JSONObject.parseObject(result);
				if(json.getIntValue("code") == 200) {
					JSONArray list = json.getJSONObject("content").getJSONArray("results");
					if(list != null && list.size() > 0) {
						count = list.size();
						for (int i = 0; i < list.size(); i++) {
							JSONObject info = list.getJSONObject(i);
							String title = HtmlUtils.htmlUnescape(info.getString("title").replace("<font color=red>", "").replace("</font>", ""));
							String url = info.getString("url");
							String time = info.getString("pubtime");
							
							if(StringUtils.isNotEmpty(url)) {		//爬虫过滤重复url
								if(redisService.includeByBloomFilter(bloomFilterHelper, "so.news.cn"+type, url)){  //url已存在
									errcount++;
								}else {
									redisService.addByBloomFilter(bloomFilterHelper, "so.news.cn"+type, url);
									if(WebCrawlerCacheUtils.getTotalCount(url, type) > 0) {
										errcount++;
										break;
									}else {
										Date date = DateUtils.str2Date(time, DateUtils.datetimeFormat);
										WebCrawlerCacheUtils.addArticle(title, url, date, type, "新华网");
									}
								}
							}
							
							log.info(title);
							log.info(url);
							log.info(time);
							log.info("----------------------------------------重复次数"+errcount+"--------------------------------------------------");
//							if(WebCrawlerCacheUtils.getTotalCount(url, 3) > 0) {
//								flag = false;  
//								break;
//							}else {
//								WebCrawlerCacheUtils.addArticle(title, url, time, 3);
//							}
						}
					}else {
						count = 0;
					}
				}
			}
			

			//查询分页列表
			page++;
			if(count > 0 && errcount < 8) {
				//get_list(keyword, page,type);
			}
		} catch (Exception e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} // 设置连接超时时间 
	}
	
	
	/**
	 * 新华网
	 */
	public  void xijinping(String keyword,int page, int type){
		//整个html内容
		Document doc;
		int errcount = 0; //重复次数
		try {
			//Thread.sleep(1000);	//
			//http://www.handannews.com.cn:9088/servlet/SearchServlet.do?contentKey=%E9%82%AF%E9%83%B8&titleKey=&authorKey=&nodeNameResult=&subNodeResult=&dateFrom=&dateEnd=&sort=&op=single&siteID=&pager.offset=20&pageNo=3  "++"&pageNo="+page
			Connection conn = Jsoup.connect("http://www.xinhuanet.com/politics/leaders/xijinping/hyhd.htm").timeout(5000);
			conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			conn.header("Accept-Encoding", "gzip, deflate, sdch");
			conn.header("Accept-Language", "zh-CN,zh;q=0.8");
			conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
			doc = conn.get();
			String name = doc.getElementsByTag("title").text();
			log.info("***************************************习近平("+name+")********************************************");
			Elements tablelist = doc.select("ul.blocks-list li");
			if(!tablelist.isEmpty()) {
				
				for (Element info : tablelist) {
					String title = info.text();
					String url = info.select("a").first().attr("href");
					String time = info.attr("data-pt");
					if(StringUtils.isNotEmpty(url)) {		//爬虫过滤重复url
						if(redisService.includeByBloomFilter(bloomFilterHelper, "so.news.cn"+type, url)){  //url已存在
							errcount++;
						}else {
							redisService.addByBloomFilter(bloomFilterHelper, "so.news.cn"+type, url);
							if(WebCrawlerCacheUtils.getTotalCount(url, type) > 0) {
								errcount++;
								break;
							}else {
								Date date = DateUtils.str2Date(time, DateUtils.date_sdf);
								WebCrawlerCacheUtils.addArticle(title, url, date, type, "新华网");
							}
						}
					}
					log.info(title);
					log.info(url);
					log.info(time);
					
					log.info("----------------------------------------重复次数"+errcount+"--------------------------------------------------");
				}
			}
			

		
		} catch (Exception e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} // 设置连接超时时间 
	}
	
	
	
	public  void cctv_xijinping(String keyword,int page, int type){
		//整个html内容
		Document doc;
		int errcount = 0; //重复次数
		try {
			Connection conn = Jsoup.connect("https://search.cctv.com/search.php?qtext=%E4%B9%A0%E8%BF%91%E5%B9%B3&sort=relevance&type=web&vtime=&datepid=1&c").timeout(5000);
			conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			conn.header("Accept-Encoding", "gzip, deflate, sdch");
			conn.header("Accept-Language", "zh-CN,zh;q=0.8");
			conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
			conn.header("Connection", "close");  
			conn.validateTLSCertificates(false);
			doc = conn.get();
			String name = doc.getElementsByTag("title").text();
			log.info("***************************************习近平("+name+")********************************************");
			Elements tablelist = doc.select("div.outer li");
			if(!tablelist.isEmpty()) {
				
				for (Element info : tablelist) {
					String title = info.select("a").text();
					String url = info.select("h3.tit span").attr("lanmu1");
					String time = info.select(".tim").text().replace("发布时间：", "");
					if(StringUtils.isNotEmpty(url)) {		//爬虫过滤重复url
						if(redisService.includeByBloomFilter(bloomFilterHelper, "search.cctv.com"+type, url)){  //url已存在
							errcount++;
							System.out.println("错误信息");
						}else {
							System.out.println("插入信息");
							redisService.addByBloomFilter(bloomFilterHelper, "search.cctv.com"+type, url);
							if(WebCrawlerCacheUtils.getTotalCount(url, type) > 0) {
								errcount++;
								break;
							}else {
								Date date = DateUtils.str2Date(time, DateUtils.datetimeFormat);
								WebCrawlerCacheUtils.addArticle(title, url, date, type, "CCTV央视新闻");
							}
						}
					}
					log.info(title);
					log.info(url);
					log.info(time);
					
					log.info("----------------------------------------重复次数"+errcount+"--------------------------------------------------");
				}
			}
			

		
		} catch (Exception e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} // 设置连接超时时间 
	}
	
	
	public  void sina_xijinping(String keyword,int page, int type){
		//整个html内容
		Document doc;
		int errcount = 0; //重复次数
		try {
			Connection conn = Jsoup.connect("https://search.sina.com.cn/?q=%CF%B0%BD%FC%C6%BD&range=title&c=news&sort=time&col=&source=&from=&country=&size=&time=&a=&t=").timeout(5000);
			conn.header("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8");
			conn.header("Accept-Encoding", "gzip, deflate, sdch");
			conn.header("Accept-Language", "zh-CN,zh;q=0.8");
			conn.header("User-Agent", "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.87 Safari/537.36");
			conn.header("Connection", "close");  
			conn.validateTLSCertificates(false);
			doc = conn.get();
			String name = doc.getElementsByTag("title").text();
			log.info("***************************************习近平("+name+")********************************************");
			Elements tablelist = doc.getElementById("result").select(".box-result h2");
			if(!tablelist.isEmpty()) {
				
				for (Element info : tablelist) {
					String title = info.select("a").text();
					String url = info.select("a").first().attr("href");
					String time = info.select(".fgray_time").text();
					String[] into = time.split(" ");
					
					
					if(StringUtils.isNotEmpty(url)) {		//爬虫过滤重复url
						if(redisService.includeByBloomFilter(bloomFilterHelper, "search.sina.com.cn"+type, url)){  //url已存在
							errcount++;
						}else {
							redisService.addByBloomFilter(bloomFilterHelper, "search.sina.com.cn"+type, url);
							if(WebCrawlerCacheUtils.getTotalCount(url, type) > 0) {
								errcount++;
								break;
							}else {
								Date date = DateUtils.str2Date(into[1] + " " + into[2], DateUtils.datetimeFormat);
								WebCrawlerCacheUtils.addArticle(title, url, date, type, into[0]);
							}
						}
					}
					log.info(title);
					log.info(url);
					log.info(time);
					
					log.info("----------------------------------------重复次数"+errcount+"--------------------------------------------------");
				}
			}
			

		
		} catch (Exception e) {
			// TODO Auto-generated catch block
			e.printStackTrace();
		} // 设置连接超时时间 
	}
	

	public static void main(String[] args) {
		new XinHuaWangJob().cctv_xijinping("", 1, 69);
	}
}
